Packages
library(DT)
library(adabag)
FALSE Loading required package: rpart
FALSE Loading required package: caret
FALSE Loading required package: ggplot2
FALSE Loading required package: lattice
FALSE Loading required package: foreach
FALSE Loading required package: doParallel
FALSE Loading required package: iterators
FALSE Loading required package: parallel
library(rpart.plot)
library(pROC)
FALSE Type 'citation("pROC")' for a citation.
FALSE
FALSE Attaching package: 'pROC'
FALSE The following objects are masked from 'package:stats':
FALSE
FALSE cov, smooth, var
library(summarytools)
library(corrplot)
FALSE corrplot 0.92 loaded
library(dplyr)
FALSE
FALSE Attaching package: 'dplyr'
FALSE The following objects are masked from 'package:stats':
FALSE
FALSE filter, lag
FALSE The following objects are masked from 'package:base':
FALSE
FALSE intersect, setdiff, setequal, union
library(GGally)
FALSE Registered S3 method overwritten by 'GGally':
FALSE method from
FALSE +.gg ggplot2
library(fastDummies)
library(ggcorrplot)
library(klaR)
FALSE Loading required package: MASS
FALSE
FALSE Attaching package: 'MASS'
FALSE The following object is masked from 'package:dplyr':
FALSE
FALSE select
library(psych)
FALSE
FALSE Attaching package: 'psych'
FALSE The following objects are masked from 'package:ggplot2':
FALSE
FALSE %+%, alpha
library(MASS)
# library(ggord)
library(devtools)
FALSE Loading required package: usethis
library(ggplot2)
library(ggthemes)
library(GGally)
library(caret)
library(splitTools)
library(rpart)
library(xgboost)
FALSE
FALSE Attaching package: 'xgboost'
FALSE The following object is masked from 'package:dplyr':
FALSE
FALSE slice
library(caTools)
library(dplyr)
library(caret)
library(naniar)
# Render a two-panel visual summary of a caret::confusionMatrix object.
# Top panel: the 2x2 confusion matrix drawn as coloured tiles, with
# "Actual" as columns and "Predicted" as rows. Bottom panel: selected
# per-class statistics (byClass) plus Accuracy and Kappa (overall).
# Only valid for a binary outcome: exactly four cells are drawn.
CM_Function <- function(cm) {
# two stacked plotting regions; the matrix panel gets double height
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
# blank canvas — every rect()/text() below uses hand-picked coordinates
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('CONFUSION MATRIX', cex.main=2)
# create the matrix
rect(150, 430, 240, 370, col='#2F4F4E')
text(195, 435, 'No', cex=1.2)
rect(250, 430, 340, 370, col='#0D8387')
text(295, 435, 'Yes', cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col='#0D8387')
rect(250, 305, 340, 365, col='#2F4F4E')
text(140, 400, 'No', cex=1.2, srt=90)
text(140, 335, 'Yes', cex=1.2, srt=90)
# add in the cm results
# as.numeric() flattens the caret table column-major (rows = Prediction,
# cols = Reference): res[1] = pred No / actual No, res[2] = pred Yes /
# actual No, res[3] = pred No / actual Yes, res[4] = pred Yes / actual Yes
res <- as.numeric(cm$table)
text(195, 400, res[1], cex=1.6, font=2, col='white')
text(195, 335, res[2], cex=1.6, font=2, col='white')
text(295, 400, res[3], cex=1.6, font=2, col='white')
text(295, 335, res[4], cex=1.6, font=2, col='white')
# add in the specifics
# labels come from names(), so the indices pick stats in caret's byClass
# order: 1 Sensitivity, 2 Specificity, 5 Precision, 6 Recall, 7 F1
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
# add in the accuracy information
# overall[1] = Accuracy, overall[2] = Kappa (again labelled via names())
text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}
#options(repos = c(
#fawda123 = 'https://fawda123.r-universe.dev',
#CRAN = 'https://cloud.r-project.org'))
# Install ggord
#install.packages('ggord')
Data and Feature Engineering
# Load the raw churn workbook and recode the three yes/no columns
# (international_plan, voice_mail_plan, churn) as 0/1 factors.
df <- readxl::read_xls('Cchurn.xls')
for (yn_col in c('international_plan', 'voice_mail_plan', 'churn')) {
  df[[yn_col]] <- factor(df[[yn_col]], levels = c('no', 'yes'), labels = c('0','1'))
}
Summary
print(summarytools::dfSummary(df), method = 'render')
- We have no missing values -> perfect
- Heavily uneven counts of dependent variable (86 % no / 14 % yes)
-> maybe sample for equality / maybe not because we lose information
of other data
- Independent variables are on different scales -> standardize
- two (maybe three) categorical predictors: International plan /
voice_mail_plan (/ maybe number_customer_service_calls) -> dummy
encode -> not necessary as already 0 and 1
- Rest of data is numeric and most of the variables look normally
distributed with the exception of number_vmail_messages and total_intl_calls
- transform these value to make them normal?
- maybe make parts of them categorical? (receiving voice mail or not,
calling internationally or not)
- or maybe the categorical values that we have already give an
indication for this
- Test normality of variables
- Can variables be combined? We have day / eve / night / intl calls
and for each of them minutes / calls / charge. Maybe we can combine this
into one metric. Maybe average cost per minute or average cost per
call?
Correlation Plot
# Pairwise correlations (and their p-values) over the numeric columns only.
df_numeric <- df[vapply(df, is.numeric, logical(1))]
M <- cor(df_numeric)
p.mat <- cor_pmat(df_numeric)
# Lower-triangle correlogram; cells that fail the 5 % significance test
# are crossed out by ggcorrplot.
ggcorrplot(
  M,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  p.mat = p.mat,
  sig.level = 0.05,
  lab_size = 2,
  tl.cex = 10,
  outline.col = "white",
  ggtheme = ggplot2::theme_minimal(),
  colors = c("#823038", "white", "#2596be")
)

Proves theory from before -> we can make one metric out of charge
and minutes -> charge / minutes
Data Engineering
# Engineer a charge-per-minute rate for each calling period. A period with
# zero total minutes would divide by zero, so those rows get a rate of 0.
df$total_day_charge_per_minute <- with(df, ifelse(total_day_minutes == 0, 0, total_day_charge / total_day_minutes))
df$total_eve_charge_per_minute <- with(df, ifelse(total_eve_minutes == 0, 0, total_eve_charge / total_eve_minutes))
df$total_night_charge_per_minute <- with(df, ifelse(total_night_minutes == 0, 0, total_night_charge / total_night_minutes))
df$total_intl_charge_per_minute <- with(df, ifelse(total_intl_minutes == 0, 0, total_intl_charge / total_intl_minutes))
# Drop the raw charge/minutes columns now folded into the rates above.
df <- df[, setdiff(names(df), c(
  "total_day_charge", "total_day_minutes",
  "total_eve_charge", "total_eve_minutes",
  "total_night_charge", "total_night_minutes",
  "total_intl_charge", "total_intl_minutes"
))]
Correlation Plot
# Re-check correlations after replacing charge/minutes pairs with rates.
df_numeric <- df[vapply(df, is.numeric, logical(1))]
M <- cor(df_numeric)
p.mat <- cor_pmat(df_numeric)
# Same correlogram style as before: lower triangle, 5 % significance marks.
ggcorrplot(
  M,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  p.mat = p.mat,
  sig.level = 0.05,
  lab_size = 2,
  tl.cex = 10,
  outline.col = "white",
  ggtheme = ggplot2::theme_minimal(),
  colors = c("#823038", "white", "#2596be")
)

Now we have non-correlated data
Relationship between variables
# theme_set(theme_minimal())
#
# ggpairs(
# data = df,
# columns = c(1:9,11:14),
# mapping = aes(col = churn, alpha = .9)
# ) +
# scale_fill_colorblind() +
# scale_color_colorblind()
We see that data is hard to separate linearly between the classes.
Therefore one can introduce new features of higher order or use methods
which do not need the data to be linearly separable.
Adding features of higher order
Only squaring as we have no negative data. Cubing would be needed
with negative data.
# squared
# squared
# Add squared copies of the numeric predictors as higher-order features.
# Columns 2, 3 and 10 (international_plan, voice_mail_plan, churn — the
# factor columns, per the indices the original code itself dropped) are
# excluded BEFORE exponentiating: squaring the whole frame first (df^2)
# raises "'^' not meaningful for factors" warnings and produces NA columns
# that were then discarded anyway. Result is identical, minus the warnings.
df2 <- df[,-c(2,3,10)]^2
colnames(df2) <- paste0(colnames(df2), '_sqd')
df <- cbind(df,df2)
Relationship between data in higher order
# theme_set(theme_minimal())
#
# ggpairs(
# data = df,
# columns = c(1:9, 11:25),
# mapping = aes(col = churn, alpha = .9)
# ) +
# scale_fill_colorblind() +
# scale_color_colorblind()
Sampling Methods
As we have unbalanced data we need to use a sampling method to
balance the classes. Hereby there are four different methods. OVER /
UNDER / BOTH / ROSE.
library(ROSE)
FALSE Loaded ROSE 0.0-4
# OVER
df_OVER <- ovun.sample(churn~., data = df, method = "over")$data
table(df$churn)
FALSE
FALSE 0 1
FALSE 4293 707
table(df_OVER$churn)
FALSE
FALSE 0 1
FALSE 4293 4338
# UNDER
df_UNDER <- ovun.sample(churn~., data = df, method = "under")$data
table(df$churn)
FALSE
FALSE 0 1
FALSE 4293 707
table(df_UNDER$churn)
FALSE
FALSE 0 1
FALSE 701 707
# BOTH
df_BOTH <- ovun.sample(churn~., data = df, method = "both")$data
table(df$churn)
FALSE
FALSE 0 1
FALSE 4293 707
table(df_BOTH$churn)
FALSE
FALSE 0 1
FALSE 2503 2497
# ROSE
df_ROSE <- ROSE(churn ~ ., data = df, seed = 1, p = 0.5)$data
Sampling Visualization
# theme_set(theme_minimal())
#
# ggpairs(
# data = df_ROSE,
# columns = c(1:9, 11:25),
# mapping = aes(col = churn, alpha = .9)
# ) +
# scale_fill_colorblind() +
# scale_color_colorblind() +
# labs(title = "Machine Learning Project")
#
# ggpairs(
# data = df_OVER,
# columns = c(1:9, 11:25),
# mapping = aes(col = churn, alpha = .9)
# ) +
# scale_fill_colorblind() +
# scale_color_colorblind() +
# labs(title = "Machine Learning Project")
# ggpairs(
# data = df_UNDER,
# columns = c(1:9, 11:25),
# mapping = aes(col = churn, alpha = .9)
# ) +
# scale_fill_colorblind() +
# scale_color_colorblind() +
# labs(title = "Machine Learning Project")
# ggpairs(
# data = df_BOTH,
# columns = c(1:9, 11:25),
# mapping = aes(col = churn, alpha = .9)
# ) +
# scale_fill_colorblind() +
# scale_color_colorblind() +
# labs(title = "Machine Learning Project")
Train / Test split
As we need to test the models we need to split the sampled data.
set.seed(1)
# choose which data to use: df_ROSE / df_BOTH / df_UNDER / df_OVER / df
data <- df_OVER
# Stratified 70/30 split on the outcome so both partitions keep the class mix.
part <- splitTools::partition(data$churn, p = c(train = 0.7, test = 0.3))
dftrain <- data[part$train, ]
dftest <- data[part$test, ]
Standardizing
As some methods need scaled data we scale the data here to be
centered.
# Learn centering/scaling parameters from the training split only, then
# apply the identical transform to both splits (avoids test-set leakage).
scaler <- preProcess(dftrain, method = c("center", "scale"))
dftrain <- predict(scaler, dftrain)
dftest <- predict(scaler, dftest)
Predicting Models
neural net
# dftrain <- dftrain |>
# mutate_if(is.factor, as.character) |>
# mutate_if(is.character, as.numeric)
#
# library(neuralnet)
# mod.neural <- neuralnet(churn ~ ., data = dftrain, hidden=c(15,15), linear.output = FALSE)
#
# predicted.neural <- predict(mod.neural, dftest[,-c(10)])
#
# confmat.neural <- confusionMatrix(data=predicted.neural, reference = dftest$churn, positive = '1')
#
# CM_Function(confmat.neural)
#
# roc_score.neural =roc(factor(dftest$churn, ordered=TRUE), factor(predicted.neural, ordered=TRUE))
# plot(roc_score.neural ,main ="ROC curve")
Boosting
set.seed(123)
# train boosted model — adabag::boosting (AdaBoost.M1), not bagging as the
# original comment claimed
mod.boost <- boosting(churn ~., data=dftrain)
# predict() returns a list; $class carries the predicted labels as character
predicted.boost <- factor(predict(mod.boost, dftest, type="class")$class)
# confusion matrix treating churn == '1' as the positive class
confmat.boost <- confusionMatrix(data=predicted.boost, reference = dftest$churn, positive = '1')
CM_Function(confmat.boost)

roc_score.boost =roc(factor(dftest$churn, ordered=TRUE), factor(predicted.boost, ordered=TRUE))
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.boost ,main ="ROC curve")

Ctree
# Grow a deliberately overfit classification tree (tiny cp, minbucket = 1)
# so that cross-validation-based pruning below can select the best subtree.
tree_full <- rpart(churn ~ .,
data = dftrain,
method = "class", # "class" because Y is a binary factor
minbucket = 1,
cp = 0.00001)
# Plot tree
rpart.plot(tree_full, yesno = TRUE, digits =-6)

# Prune back to the subtree whose complexity parameter minimises the
# cross-validated error reported in the cptable.
min_xerr<- which.min(tree_full$cptable[,"xerror"]) # select minimum cross-validation error
cp_bp <- tree_full$cptable[min_xerr,"CP"] # find the corresponding CP value, to get the "best pruned " tree
mod.pruned_tree<- prune(tree_full, cp = cp_bp) # re-compute the tree with the selected Cp
rpart.plot(mod.pruned_tree, yesno = TRUE, digits =-3)

# Evaluate the pruned tree on the test split (column 10 = churn is dropped
# from the predictors). NOTE(review): "prunned" is a typo kept for
# compatibility with later references to these object names.
predicted.pruned_tree <- predict(mod.pruned_tree, dftest[,-c(10)], type = "class")
confmat.prunned_tree <- confusionMatrix(data=predicted.pruned_tree, reference = dftest$churn, positive = '1')
CM_Function(confmat.prunned_tree)

roc_score.prunned_tree =roc(factor(dftest$churn, ordered=TRUE), factor(predicted.pruned_tree, ordered=TRUE))
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.prunned_tree ,main ="ROC curve")

Bagging
set.seed(123)
library(ipred)
FALSE
FALSE Attaching package: 'ipred'
FALSE The following object is masked from 'package:adabag':
FALSE
FALSE bagging
# train bagged model
# This is ipred::bagging (it masks adabag::bagging — see the attach message
# above): 100 bootstrap trees, grown essentially unpruned (minsplit = 2,
# cp = 0). coob = TRUE makes print() report the out-of-bag error estimate.
ames_bag1 <- bagging(
formula = churn ~ .,
data = dftrain,
nbagg = 100,
coob = TRUE,
control = rpart.control(minsplit = 2, cp = 0)
)
ames_bag1
FALSE
FALSE Bagging classification trees with 100 bootstrap replications
FALSE
FALSE Call: bagging.data.frame(formula = churn ~ ., data = dftrain, nbagg = 100,
FALSE coob = TRUE, control = rpart.control(minsplit = 2, cp = 0))
FALSE
FALSE Out-of-bag estimate of misclassification error: 0.0442
# Column 2 of the probability matrix corresponds to the second factor level
# ('1'); threshold P(churn = 1) at 0.5 to obtain hard class labels.
predicted <- factor(ifelse(predict(ames_bag1, dftest[,-c(10)], type = 'prob')[,2] >= 0.5, 1, 0))
CM_Function(confusionMatrix(data=predicted, reference = dftest$churn, positive = '1'))

roc_score=roc(factor(dftest$churn, ordered=TRUE), factor(predicted, ordered=TRUE)) #AUC score
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score ,main ="ROC curve")

knn
set.seed(1)
# Grid-search k = 1..30 for kNN, recording test accuracy and sensitivity.
# Results go into a dedicated frame: the original overwrote the global `df`
# (the churn dataset) here, clobbering the data other sections refer to.
knn_results <- data.frame(k = seq_len(30), accuracy = rep(0, 30), sensitivity = rep(0, 30))
# iterating over different ks
for (i in seq_len(30)) {
  # nearest neighbor fit on the training predictors (column 10 = churn)
  KNN1 <- knn3(y = dftrain$churn, x = dftrain[, -c(10)], k = i)
  # class predictions on the test split
  KNN1.pred.valid.resp <- predict(KNN1, dftest[, -c(10)], type = "class")
  # compute the confusion matrix once per k (the original computed it twice
  # per iteration, and also made an unused probability prediction)
  cm_k <- confusionMatrix(KNN1.pred.valid.resp, dftest$churn, positive = "1")
  knn_results$sensitivity[i] <- cm_k$byClass[1]
  knn_results$accuracy[i] <- cm_k$overall[1]
}
# plot the k's
ggplot(knn_results, aes(x = k)) +
  geom_line(aes(y = sensitivity, colour = "Sensitivity")) +
  geom_line(aes(y = accuracy, colour = "Accuracy")) +
  labs(x = "Number of k nearest neighbours",
       y = "Accuracy / Sensitivity", title = "Accuracy / Sensitivity regarding k") +
  theme_minimal() +
  scale_y_continuous(name = "Sensitivity / Accuracy", limits = c(0.7, 1)) +
  scale_color_manual(name = "Values", values = c("Sensitivity" = "darkblue", "Accuracy" = "red")) +
  xlim(1, 30)

# Final kNN model; k = 2 presumably chosen from the sensitivity/accuracy
# plot above — confirm that choice if the sampling method changes.
mod.knn <- knn3(y = dftrain$churn, x = dftrain[,-c(10)], k = 2)
predicted.knn <- predict(mod.knn, dftest[,-c(10)], type = "class")
confmat.knn <- confusionMatrix(data=predicted.knn, reference = dftest$churn, positive = '1')
CM_Function(confmat.knn)

roc_score.qda =roc(factor(dftest$churn, ordered=TRUE), factor(predicted.knn, ordered=TRUE))
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.qda ,main ="ROC curve")

QDA
# Quadratic discriminant analysis (MASS::qda) on the standardized training
# split; predict() returns a list whose $class element holds the labels.
mod.qda <- qda(churn ~., data = dftrain)
predicted.qda <- predict(mod.qda, dftest[,-c(10)])$class
confmat.qda <- confusionMatrix(data=predicted.qda, reference = dftest$churn, positive = '1')
CM_Function(confmat.qda)

roc_score.qda =roc(factor(dftest$churn, ordered=TRUE), factor(predicted.qda, ordered=TRUE))
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.qda ,main ="ROC curve")

QLOG
# NOTE(review): the section is titled "QLOG" but the link here is probit,
# not logit — confirm this is intentional.
mod.log <- glm(churn ~., data = dftrain, family = binomial(link = "probit"))
# Stepwise selection by AIC; with no `scope` argument step() searches
# backward from the full model. The selected formula is reused below.
s <- step(mod.log)
FALSE Start: AIC=6873.98
FALSE churn ~ account_length + international_plan + voice_mail_plan +
FALSE number_vmail_messages + total_day_calls + total_eve_calls +
FALSE total_night_calls + total_intl_calls + number_customer_service_calls +
FALSE total_day_charge_per_minute + total_eve_charge_per_minute +
FALSE total_night_charge_per_minute + total_intl_charge_per_minute +
FALSE account_length_sqd + number_vmail_messages_sqd + total_day_calls_sqd +
FALSE total_eve_calls_sqd + total_night_calls_sqd + total_intl_calls_sqd +
FALSE number_customer_service_calls_sqd + total_day_charge_per_minute_sqd +
FALSE total_eve_charge_per_minute_sqd + total_night_charge_per_minute_sqd +
FALSE total_intl_charge_per_minute_sqd
FALSE
FALSE Df Deviance AIC
FALSE - account_length_sqd 1 6824.0 6872.0
FALSE - total_day_charge_per_minute 1 6824.0 6872.0
FALSE - total_day_charge_per_minute_sqd 1 6824.0 6872.0
FALSE - account_length 1 6824.2 6872.2
FALSE - total_night_calls_sqd 1 6824.4 6872.4
FALSE - total_night_calls 1 6824.7 6872.7
FALSE - total_intl_charge_per_minute 1 6825.4 6873.4
FALSE - total_intl_charge_per_minute_sqd 1 6825.7 6873.7
FALSE <none> 6824.0 6874.0
FALSE - total_eve_calls 1 6826.6 6874.6
FALSE - total_eve_calls_sqd 1 6826.9 6874.9
FALSE - total_night_charge_per_minute_sqd 1 6827.4 6875.4
FALSE - total_day_calls 1 6827.5 6875.5
FALSE - total_night_charge_per_minute 1 6827.5 6875.5
FALSE - total_day_calls_sqd 1 6829.3 6877.3
FALSE - number_vmail_messages_sqd 1 6830.4 6878.4
FALSE - number_vmail_messages 1 6834.5 6882.5
FALSE - number_customer_service_calls 1 6848.8 6896.8
FALSE - total_intl_calls_sqd 1 6849.2 6897.2
FALSE - voice_mail_plan 1 6854.4 6902.4
FALSE - total_eve_charge_per_minute 1 6855.4 6903.4
FALSE - total_eve_charge_per_minute_sqd 1 6855.4 6903.4
FALSE - total_intl_calls 1 6855.7 6903.7
FALSE - number_customer_service_calls_sqd 1 6993.9 7041.9
FALSE - international_plan 1 7418.9 7466.9
FALSE
FALSE Step: AIC=6871.99
FALSE churn ~ account_length + international_plan + voice_mail_plan +
FALSE number_vmail_messages + total_day_calls + total_eve_calls +
FALSE total_night_calls + total_intl_calls + number_customer_service_calls +
FALSE total_day_charge_per_minute + total_eve_charge_per_minute +
FALSE total_night_charge_per_minute + total_intl_charge_per_minute +
FALSE number_vmail_messages_sqd + total_day_calls_sqd + total_eve_calls_sqd +
FALSE total_night_calls_sqd + total_intl_calls_sqd + number_customer_service_calls_sqd +
FALSE total_day_charge_per_minute_sqd + total_eve_charge_per_minute_sqd +
FALSE total_night_charge_per_minute_sqd + total_intl_charge_per_minute_sqd
FALSE
FALSE Df Deviance AIC
FALSE - total_day_charge_per_minute 1 6824.0 6870.0
FALSE - total_day_charge_per_minute_sqd 1 6824.0 6870.0
FALSE - total_night_calls_sqd 1 6824.4 6870.4
FALSE - total_night_calls 1 6824.7 6870.7
FALSE - total_intl_charge_per_minute 1 6825.5 6871.5
FALSE - total_intl_charge_per_minute_sqd 1 6825.7 6871.7
FALSE <none> 6824.0 6872.0
FALSE - total_eve_calls 1 6826.6 6872.6
FALSE - total_eve_calls_sqd 1 6826.9 6872.9
FALSE - total_night_charge_per_minute_sqd 1 6827.5 6873.5
FALSE - total_day_calls 1 6827.5 6873.5
FALSE - total_night_charge_per_minute 1 6827.6 6873.6
FALSE - account_length 1 6827.9 6873.9
FALSE - total_day_calls_sqd 1 6829.3 6875.3
FALSE - number_vmail_messages_sqd 1 6830.4 6876.4
FALSE - number_vmail_messages 1 6834.5 6880.5
FALSE - number_customer_service_calls 1 6848.8 6894.8
FALSE - total_intl_calls_sqd 1 6849.2 6895.2
FALSE - voice_mail_plan 1 6854.4 6900.4
FALSE - total_eve_charge_per_minute 1 6855.4 6901.4
FALSE - total_eve_charge_per_minute_sqd 1 6855.4 6901.4
FALSE - total_intl_calls 1 6855.7 6901.7
FALSE - number_customer_service_calls_sqd 1 6994.0 7040.0
FALSE - international_plan 1 7419.0 7465.0
FALSE
FALSE Step: AIC=6870.01
FALSE churn ~ account_length + international_plan + voice_mail_plan +
FALSE number_vmail_messages + total_day_calls + total_eve_calls +
FALSE total_night_calls + total_intl_calls + number_customer_service_calls +
FALSE total_eve_charge_per_minute + total_night_charge_per_minute +
FALSE total_intl_charge_per_minute + number_vmail_messages_sqd +
FALSE total_day_calls_sqd + total_eve_calls_sqd + total_night_calls_sqd +
FALSE total_intl_calls_sqd + number_customer_service_calls_sqd +
FALSE total_day_charge_per_minute_sqd + total_eve_charge_per_minute_sqd +
FALSE total_night_charge_per_minute_sqd + total_intl_charge_per_minute_sqd
FALSE
FALSE Df Deviance AIC
FALSE - total_night_calls_sqd 1 6824.5 6868.5
FALSE - total_night_calls 1 6824.7 6868.7
FALSE - total_intl_charge_per_minute 1 6825.5 6869.5
FALSE - total_intl_charge_per_minute_sqd 1 6825.7 6869.7
FALSE <none> 6824.0 6870.0
FALSE - total_eve_calls 1 6826.7 6870.7
FALSE - total_eve_calls_sqd 1 6826.9 6870.9
FALSE - total_night_charge_per_minute_sqd 1 6827.5 6871.5
FALSE - total_day_calls 1 6827.5 6871.5
FALSE - total_night_charge_per_minute 1 6827.6 6871.6
FALSE - account_length 1 6828.0 6872.0
FALSE - total_day_calls_sqd 1 6829.3 6873.3
FALSE - number_vmail_messages_sqd 1 6830.5 6874.5
FALSE - number_vmail_messages 1 6834.6 6878.6
FALSE - total_day_charge_per_minute_sqd 1 6836.5 6880.5
FALSE - number_customer_service_calls 1 6848.9 6892.9
FALSE - total_intl_calls_sqd 1 6849.3 6893.3
FALSE - voice_mail_plan 1 6854.4 6898.4
FALSE - total_eve_charge_per_minute 1 6855.4 6899.4
FALSE - total_eve_charge_per_minute_sqd 1 6855.4 6899.4
FALSE - total_intl_calls 1 6855.7 6899.7
FALSE - number_customer_service_calls_sqd 1 6994.0 7038.0
FALSE - international_plan 1 7419.0 7463.0
FALSE
FALSE Step: AIC=6868.46
FALSE churn ~ account_length + international_plan + voice_mail_plan +
FALSE number_vmail_messages + total_day_calls + total_eve_calls +
FALSE total_night_calls + total_intl_calls + number_customer_service_calls +
FALSE total_eve_charge_per_minute + total_night_charge_per_minute +
FALSE total_intl_charge_per_minute + number_vmail_messages_sqd +
FALSE total_day_calls_sqd + total_eve_calls_sqd + total_intl_calls_sqd +
FALSE number_customer_service_calls_sqd + total_day_charge_per_minute_sqd +
FALSE total_eve_charge_per_minute_sqd + total_night_charge_per_minute_sqd +
FALSE total_intl_charge_per_minute_sqd
FALSE
FALSE Df Deviance AIC
FALSE - total_night_calls 1 6825.9 6867.9
FALSE - total_intl_charge_per_minute 1 6826.0 6868.0
FALSE - total_intl_charge_per_minute_sqd 1 6826.2 6868.2
FALSE <none> 6824.5 6868.5
FALSE - total_eve_calls 1 6827.1 6869.1
FALSE - total_eve_calls_sqd 1 6827.4 6869.4
FALSE - total_night_charge_per_minute_sqd 1 6827.9 6869.9
FALSE - total_day_calls 1 6828.1 6870.1
FALSE - total_night_charge_per_minute 1 6828.1 6870.1
FALSE - account_length 1 6828.5 6870.5
FALSE - total_day_calls_sqd 1 6829.9 6871.9
FALSE - number_vmail_messages_sqd 1 6830.9 6872.9
FALSE - number_vmail_messages 1 6835.0 6877.0
FALSE - total_day_charge_per_minute_sqd 1 6836.8 6878.8
FALSE - number_customer_service_calls 1 6849.3 6891.3
FALSE - total_intl_calls_sqd 1 6849.6 6891.6
FALSE - voice_mail_plan 1 6854.8 6896.8
FALSE - total_eve_charge_per_minute 1 6856.1 6898.1
FALSE - total_eve_charge_per_minute_sqd 1 6856.1 6898.1
FALSE - total_intl_calls 1 6856.2 6898.2
FALSE - number_customer_service_calls_sqd 1 6994.3 7036.3
FALSE - international_plan 1 7419.8 7461.8
FALSE
FALSE Step: AIC=6867.93
FALSE churn ~ account_length + international_plan + voice_mail_plan +
FALSE number_vmail_messages + total_day_calls + total_eve_calls +
FALSE total_intl_calls + number_customer_service_calls + total_eve_charge_per_minute +
FALSE total_night_charge_per_minute + total_intl_charge_per_minute +
FALSE number_vmail_messages_sqd + total_day_calls_sqd + total_eve_calls_sqd +
FALSE total_intl_calls_sqd + number_customer_service_calls_sqd +
FALSE total_day_charge_per_minute_sqd + total_eve_charge_per_minute_sqd +
FALSE total_night_charge_per_minute_sqd + total_intl_charge_per_minute_sqd
FALSE
FALSE Df Deviance AIC
FALSE - total_intl_charge_per_minute 1 6827.4 6867.4
FALSE - total_intl_charge_per_minute_sqd 1 6827.7 6867.7
FALSE <none> 6825.9 6867.9
FALSE - total_eve_calls 1 6828.6 6868.6
FALSE - total_eve_calls_sqd 1 6828.9 6868.9
FALSE - total_night_charge_per_minute_sqd 1 6829.4 6869.4
FALSE - total_day_calls 1 6829.4 6869.4
FALSE - total_night_charge_per_minute 1 6829.6 6869.6
FALSE - account_length 1 6829.8 6869.8
FALSE - total_day_calls_sqd 1 6831.2 6871.2
FALSE - number_vmail_messages_sqd 1 6832.4 6872.4
FALSE - number_vmail_messages 1 6836.5 6876.5
FALSE - total_day_charge_per_minute_sqd 1 6838.2 6878.2
FALSE - number_customer_service_calls 1 6850.3 6890.3
FALSE - total_intl_calls_sqd 1 6850.9 6890.9
FALSE - voice_mail_plan 1 6856.3 6896.3
FALSE - total_eve_charge_per_minute 1 6857.4 6897.4
FALSE - total_eve_charge_per_minute_sqd 1 6857.4 6897.4
FALSE - total_intl_calls 1 6857.4 6897.4
FALSE - number_customer_service_calls_sqd 1 6995.0 7035.0
FALSE - international_plan 1 7421.8 7461.8
FALSE
FALSE Step: AIC=6867.44
FALSE churn ~ account_length + international_plan + voice_mail_plan +
FALSE number_vmail_messages + total_day_calls + total_eve_calls +
FALSE total_intl_calls + number_customer_service_calls + total_eve_charge_per_minute +
FALSE total_night_charge_per_minute + number_vmail_messages_sqd +
FALSE total_day_calls_sqd + total_eve_calls_sqd + total_intl_calls_sqd +
FALSE number_customer_service_calls_sqd + total_day_charge_per_minute_sqd +
FALSE total_eve_charge_per_minute_sqd + total_night_charge_per_minute_sqd +
FALSE total_intl_charge_per_minute_sqd
FALSE
FALSE Df Deviance AIC
FALSE <none> 6827.4 6867.4
FALSE - total_eve_calls 1 6830.1 6868.1
FALSE - total_eve_calls_sqd 1 6830.4 6868.4
FALSE - total_night_charge_per_minute_sqd 1 6830.8 6868.8
FALSE - total_day_calls 1 6830.9 6868.9
FALSE - total_night_charge_per_minute 1 6830.9 6868.9
FALSE - account_length 1 6831.3 6869.3
FALSE - total_day_calls_sqd 1 6832.7 6870.7
FALSE - number_vmail_messages_sqd 1 6833.8 6871.8
FALSE - number_vmail_messages 1 6837.9 6875.9
FALSE - total_day_charge_per_minute_sqd 1 6839.9 6877.9
FALSE - total_intl_charge_per_minute_sqd 1 6843.0 6881.0
FALSE - total_intl_calls_sqd 1 6852.1 6890.1
FALSE - number_customer_service_calls 1 6852.2 6890.2
FALSE - voice_mail_plan 1 6857.6 6895.6
FALSE - total_intl_calls 1 6858.5 6896.5
FALSE - total_eve_charge_per_minute 1 6858.6 6896.6
FALSE - total_eve_charge_per_minute_sqd 1 6858.6 6896.6
FALSE - number_customer_service_calls_sqd 1 6996.7 7034.7
FALSE - international_plan 1 7421.9 7459.9
# Refit the probit model using only the stepwise-selected terms.
mod.log <- glm(s$formula, data = dftrain, family = binomial(link = "probit"))
# type='response' yields P(churn = 1); threshold at 0.5 for hard labels.
predicted.log <- factor(ifelse(predict(mod.log, dftest[,-c(10)], type='response')>0.5,1,0))
confmat.log <- confusionMatrix(data=predicted.log, reference = dftest$churn, positive = '1')
CM_Function(confmat.log)

roc_score.log =roc(factor(dftest$churn, ordered=TRUE), factor(predicted.log, ordered=TRUE))
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.log ,main ="ROC curve")

Gaussian SVM
library(e1071)
# RBF-kernel support vector machine with default cost/gamma.
mod.svm = svm(formula = churn ~ .,
data = dftrain,
type = 'C-classification', # standard C-SVM classification (not regression, as the original comment claimed)
kernel = 'radial')
predicted.svm <- predict(mod.svm, dftest[,-c(10)])
confmat.svm <- confusionMatrix(data=predicted.svm, reference = dftest$churn, positive = '1')
CM_Function(confmat.svm)

roc_score.svm =roc(factor(dftest$churn, ordered=TRUE), factor(predicted.svm, ordered=TRUE))
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.svm ,main ="ROC curve")

Boosting
# library(gbm)
#
# mod <- gbm(churn ~.,
# data = dftrain,
# distribution = "gaussian",
# cv.folds = 10,
# shrinkage = .01,
# n.minobsinnode = 10,
# n.trees = 500)
#
# predicted <- factor(ifelse(1/(1+exp(-2*predict.gbm(mod, dftest[,-c(10)])))>=0.5,1,0))
#
#
# dftrain <- dftrain |>
# mutate_if(is.factor, as.character) |>
# mutate_if(is.character, as.numeric)
#
# dftest <- dftest |>
# mutate_if(is.factor, as.character) |>
# mutate_if(is.character, as.numeric)
#
# xgb_train <- xgb.DMatrix(data = as.matrix(dftrain[,-c(10)]), label = dftrain$churn)
# xgb_test <- xgb.DMatrix(data = as.matrix(dftest[,-c(10)]), label = dftest$churn)
# xgb_params <- list(
# booster = "gbtree",
# eta = 0.01,
# max_depth = 8,
# gamma = 4,
# subsample = 0.75,
# colsample_bytree = 1,
# objective = "multi:softprob",
# eval_metric = "mlogloss",
# num_class = 2)
#
# xgb_model <- xgb.train(
# params = xgb_params,
# data = xgb_train,
# nrounds = 100,
# verbose = 1
# )
#
# xgb_model
#
# xgb_preds <- predict(xgb_model, as.matrix(dftest$churn), reshape = TRUE)
# xgb_preds <- as.data.frame(xgb_preds)
# colnames(xgb_preds) <- c(0,1)
# predicted <- ifelse(xgb_preds[,2] > 0.5, 1, 0)
#
# CM_Function(confusionMatrix(data=predicted, reference = dftest$churn, positive = "1"))
#
# library(pROC)
# roc_score=roc(factor(dftest$churn, ordered=TRUE), factor(predicted, ordered=TRUE)) #AUC score
# plot(roc_score ,main ="ROC curve")
LDA
# LDA Model Fit on Training
#LDA_training <- lda(churn~., df_UNDER)
#LDA_training
# Density Plot for Overlapping
#p <- predict(LDA_training, df)
#ldahist(data = p$x[,1], g = df$churn, col = "#0D8387")
Confusion Matrix (Training VS Test)
# Training 60%
#LDA_predictions_training <- predict(LDA_training, df)$class
#LDA_Confusion_Matrix_training <- confusionMatrix(data = LDA_predictions_training, reference = df$churn, positive='yes')
#CM_Function(LDA_Confusion_Matrix_training)
Random Forest V2
# ROSE DATA
#rf <- randomForest(churn~., data=df_UNDER, proximity=TRUE)
#Predictions_OVER <- predict(rf, df_UNDER[,-10])
#CM_OVER <- confusionMatrix(Predictions_OVER, df_UNDER$churn, positive = 'yes')
#CM_Function(CM_OVER)
# ORIGINAL DATA
#Predictions_DATA <- predict(rf, df[,-10])
#CM_DATA <- confusionMatrix(Predictions_DATA, df$churn, positive = 'yes')
#CM_Function(CM_DATA)
Decision Trees
#tree <- rpart(churn ~., data = df)
#rpart.plot(tree)
#printcp(tree)
#plotcp(tree)
#Predictions_DT <- predict(tree, df[,-10])[,2]
#Predictions_DT <- ifelse(Predictions_DT > 0.5, "yes","no")
#Predictions_DT <- as.factor(Predictions_DT)
#CM_DATA <- confusionMatrix(Predictions_DT, df$churn, positive = 'yes')
#CM_Function(CM_DATA)
Neural Net
#library(caret)
#nn1 <- train(churn ~ ., data = df_UNDER, method = "nnet")
#nn1.pre <- predict(nn1, df[,-c(10)])
#confusionMatrix(nn1.pre, df$churn, positive = 'yes')
#CM_Function(confusionMatrix(nn1.pre, df$churn, positive = 'yes'))
Support Vector Machines
#library(e1071)
#classifierR = svm(formula = churn ~ .,
#data = dftrain,
#type = 'C-classification', # this is because we want to make a regression classification
#kernel = 'radial',
#cost = 100,
#gamma = 20)
#svm_1 <- predict(object = classifierR, newdata = dftest[,-c(10)])
#confusionMatrix(svm_1, dftest$churn, positive = '1')
#CM_Function(confusionMatrix(svm_1, df$churn, positive = '1'))